{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "ce1c75bf-4735-42cd-92f5-8bd846856738",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "from scipy.stats import pearsonr, spearmanr\n",
    "import scipy.sparse as sparse\n",
    "from scipy.stats import bernoulli, poisson\n",
    "import analysis_utils_mine as utils\n",
    "\n",
    "import json\n",
    "import pandas as pd\n",
    "import ast\n",
    "from datetime import datetime\n",
    "import torch\n",
    "import pandas as pd\n",
    "from datetime import datetime, timedelta\n",
    "import pickle\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import xlsxwriter\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b0a0a14a-0dc7-4b21-b3ae-4e831cbe76b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "345ea2d6-5f15-425f-b5c3-ed282cc46b6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_ideological_topics(objective_topic_loc, \n",
    "                           objective_topic_scale,\n",
    "                           ideological_topic_loc, \n",
    "                           ideological_topic_scale,\n",
    "                           ideal_point):\n",
    "    ideological_topic_mean = np.exp(objective_topic_loc +\n",
    "                              ideal_point * ideological_topic_loc +\n",
    "                              (objective_topic_scale ** 2 + \n",
    "                               ideal_point ** 2 * \n",
    "                               ideological_topic_scale ** 2) / 2)\n",
    "    return ideological_topic_mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "94ff5fce-cbee-439d-aa4b-282681f6ca76",
   "metadata": {},
   "outputs": [],
   "source": [
    "def rescale_to_probs_renorm(arr):\n",
    "    return arr/arr.sum(1, keepdims=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b858640c-3c6d-4d39-90ea-8521fc3c7eae",
   "metadata": {},
   "outputs": [],
   "source": [
    "labels_to_discard = ['DISCARD', 'Congratulations', 'Honoring Service Members', 'Member Votes', 'Memorial Speech', \n",
    "                     'Motions', 'Yielding', 'Congratulatory Messages', 'Constituent Outreach',\n",
    "                     'Dear Colleague and Newsletters', 'Live Proceedings', \n",
    "                     'Local Constituent Services', 'Social Media', 'Town Hall Meetings', 'Tributes']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "3574c947-89e5-4554-a7cf-9c50c70a0554",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _top_words_and_probs(topic_word_row, vocab, num_top_words):\n",
    "    \"\"\"Return (words, weights) for the largest ``num_top_words`` entries of one topic row, largest first.\"\"\"\n",
    "    top_inds = np.argsort(list(topic_word_row))[::-1][:num_top_words]\n",
    "    words = [vocab[i] for i in top_inds]\n",
    "    weights = [topic_word_row[i] for i in top_inds]\n",
    "    return words, weights\n",
    "\n",
    "\n",
    "def create_topics_file_for_human_labeling_and_rating(topic_word_minus1,\n",
    "                                                     topic_word_plus1,\n",
    "                                                     vocab,\n",
    "                                                     outpath,\n",
    "                                                     selected_topic_inds,\n",
    "                                                     selected_topic_labels,\n",
    "                                                     num_top_words = 20):\n",
    "    \"\"\"Write the annotation workbook and return the a)/b) assignment per topic.\n",
    "\n",
    "    For each selected topic one issue block is written to\n",
    "    ``outpath + 'annotation_file.xlsx'``: an issue row carrying drop-down\n",
    "    rating cells (columns G-K), followed by the top ``num_top_words`` words\n",
    "    and weights of the two word lists, shown side by side as a) (columns A-B)\n",
    "    and b) (columns D-E). Which of ``topic_word_minus1`` /\n",
    "    ``topic_word_plus1`` is shown as a) is drawn from\n",
    "    ``random.Random(position + 1)``, so the assignment is reproducible.\n",
    "\n",
    "    Args:\n",
    "        topic_word_minus1: Topic-by-word weights, indexed ``[topic][word]``.\n",
    "        topic_word_plus1: Same layout as ``topic_word_minus1``.\n",
    "        vocab: Sequence mapping a word index to the word string.\n",
    "        outpath: Prefix prepended verbatim to ``'annotation_file.xlsx'``\n",
    "            (include a trailing path separator when passing a directory).\n",
    "        selected_topic_inds: Topic indices to include, in display order.\n",
    "        selected_topic_labels: Label of each selected topic, in the same order.\n",
    "        num_top_words: Number of top words listed per side.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping each topic index to ``{'a': s, 'b': -s}``, where ``s`` is\n",
    "        -1 when a) shows ``topic_word_minus1`` and 1 when a) shows\n",
    "        ``topic_word_plus1``.\n",
    "    \"\"\"\n",
    "    workbook = xlsxwriter.Workbook(outpath + 'annotation_file.xlsx')\n",
    "    workbook.formats[0].set_font_size(12)\n",
    "    worksheet = workbook.add_worksheet()\n",
    "    worksheet.freeze_panes(1, 0)\n",
    "\n",
    "    # Format for the header cells.\n",
    "    header_format = workbook.add_format({\n",
    "        'bottom': 5,\n",
    "        'top': 5,\n",
    "        'font_size': 12,\n",
    "        'bold': True,\n",
    "        'text_wrap': True,\n",
    "        'align': 'center',\n",
    "    })\n",
    "\n",
    "    n_labels = len(selected_topic_labels)\n",
    "\n",
    "    worksheet.set_row(0, 40)\n",
    "\n",
    "    # Column layout of the worksheet: (column letter, width).\n",
    "    column_widths = [('A', 25), ('B', 10), ('C', 15), ('D', 25), ('E', 10), ('F', 3),\n",
    "                     ('G', 30), ('H', 30), ('I', 50), ('J', 25), ('K', 25), ('L', 150)]\n",
    "    for col, width in column_widths:\n",
    "        worksheet.set_column(col + ':' + col, width)\n",
    "\n",
    "    # Header row: (column letter, header text).\n",
    "    headers = [('A', ''),\n",
    "               ('B', ''),\n",
    "               ('C', 'Issue'),\n",
    "               ('D', ''),\n",
    "               ('E', ''),\n",
    "               ('F', ''),\n",
    "               ('G', 'Label Applicability for a)'),\n",
    "               ('H', 'Label Applicability for b)'),\n",
    "               ('I', 'Ideological Polarization'),\n",
    "               ('J', 'Ideological Position expressed in a)'),\n",
    "               ('K', 'Ideological Position expressed in b)'),\n",
    "               ('L', 'Notes/Comments')]\n",
    "    for col, text in headers:\n",
    "        worksheet.write(col + '1', text, header_format)\n",
    "\n",
    "    border = workbook.add_format({'top': 2,\n",
    "                                  'bottom': 2})\n",
    "    border_plus_highlighting = workbook.add_format({'top': 2,\n",
    "                                                    'bottom': 2,\n",
    "                                                    'left': 1,\n",
    "                                                    'right': 1,\n",
    "                                                    'bg_color': '#FFFFEC'})\n",
    "\n",
    "    topic_ind_to_a_b_info = {k: {} for k in selected_topic_inds}\n",
    "\n",
    "    for on_issue in range(n_labels):\n",
    "        k = selected_topic_inds[on_issue]\n",
    "        label = selected_topic_labels[on_issue]\n",
    "        # 1-based (A1 notation) row of the issue row; each issue block spans num_top_words + 3 rows.\n",
    "        issue_row = ((num_top_words + 3) * on_issue) + 3\n",
    "        first_word_row = issue_row + 2\n",
    "        last_word_row = issue_row + num_top_words + 1\n",
    "\n",
    "        for c in ['A', 'B', 'D', 'E', 'F', 'L']:\n",
    "            worksheet.write(c + str(issue_row), '', border)\n",
    "        for c in ['G', 'H', 'I', 'J', 'K']:\n",
    "            worksheet.write(c + str(issue_row), '', border_plus_highlighting)\n",
    "\n",
    "        worksheet.write('C' + str(issue_row), 'Issue ' + str(on_issue + 1), border)\n",
    "\n",
    "        worksheet.write('A' + str(issue_row + 1), 'a)')\n",
    "        # set_row() takes a 0-based row index, so this addresses the a)/b) row written above and below.\n",
    "        worksheet.set_row(issue_row, None, None, {'collapsed': True})\n",
    "        worksheet.write('D' + str(issue_row + 1), 'b)')\n",
    "\n",
    "        # Seeded per position (seed = position + 1) so that the a)/b) assignment is reproducible.\n",
    "        a_choice = random.Random(on_issue + 1).sample([-1, 1], 1)[0]\n",
    "        topic_ind_to_a_b_info[k]['a'] = a_choice\n",
    "        topic_ind_to_a_b_info[k]['b'] = -a_choice\n",
    "\n",
    "        if a_choice == -1:\n",
    "            side_a, side_b = topic_word_minus1, topic_word_plus1\n",
    "        else:\n",
    "            side_a, side_b = topic_word_plus1, topic_word_minus1\n",
    "        top_words_a, top_word_probs_a = _top_words_and_probs(side_a[k], vocab, num_top_words)\n",
    "        top_words_b, top_word_probs_b = _top_words_and_probs(side_b[k], vocab, num_top_words)\n",
    "\n",
    "        word_rows = zip(top_words_a, top_word_probs_a, top_words_b, top_word_probs_b)\n",
    "        for offset, (word_a, prob_a, word_b, prob_b) in enumerate(word_rows):\n",
    "            row = str(first_word_row + offset)\n",
    "            worksheet.write('A' + row, word_a)\n",
    "            worksheet.write('B' + row, prob_a)\n",
    "\n",
    "            worksheet.write('D' + row, word_b)\n",
    "            worksheet.write('E' + row, prob_b)\n",
    "\n",
    "            # 0-based index of the row just written; grouped under the issue and hidden by default.\n",
    "            worksheet.set_row(issue_row + offset + 1, None, None, {'level': 1, 'hidden': True})\n",
    "\n",
    "        # One data bar per weight column. The b) range must stay within column E:\n",
    "        # a range written as 'E..:B..' would cover columns B-E instead.\n",
    "        for col in ['B', 'E']:\n",
    "            worksheet.conditional_format(col + str(first_word_row) + ':' + col + str(last_word_row),\n",
    "                                         {'type': 'data_bar',\n",
    "                                          'bar_only': True,\n",
    "                                          'bar_solid': True})\n",
    "\n",
    "        for col, side in [('G', 'a'), ('H', 'b')]:\n",
    "            worksheet.data_validation(col + str(issue_row),\n",
    "                                      {'validate': 'list',\n",
    "                                       'source': [side + '): IS about ' + label,\n",
    "                                                  side + '): MIGHT be about ' + label,\n",
    "                                                  side + '): IS NOT about ' + label]})\n",
    "\n",
    "        # Only the first option repeats the topic label: repeating it in every option exceeded\n",
    "        # Excel's character limit for list validation for certain long topic names.\n",
    "        persp_list = ['a) and b) represent polarized perspectives on the issue: ' + label,\n",
    "                      'a) and b) represent SOMEWHAT polarized perspectives on above issue',\n",
    "                      'a) and b) DO NOT represent polarized perspectives on above issue',\n",
    "                      'Unsure']\n",
    "        worksheet.data_validation('I' + str(issue_row),\n",
    "                                  {'validate': 'list',\n",
    "                                   'source': persp_list})\n",
    "\n",
    "        for col in ['J', 'K']:\n",
    "            worksheet.data_validation(col + str(issue_row),\n",
    "                                      {'validate': 'list',\n",
    "                                       'source': ['Liberal',\n",
    "                                                  'Unsure',\n",
    "                                                  'Conservative',\n",
    "                                                  'N/A']})\n",
    "\n",
    "    workbook.close()\n",
    "    return topic_ind_to_a_b_info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "3271a8c8-01b3-4e88-a638-12503b06e7a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_dataframe_from_annotated_xlsx_file_path(path):\n",
    "    df = pd.read_excel(path,\n",
    "                       sheet_name=None,\n",
    "                       engine='openpyxl')\n",
    "    df = df['Sheet1']\n",
    "    df = df[~pd.isnull(df['Issue'])]\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4349d3b8-8792-465f-bb34-2acb1f5315a2",
   "metadata": {},
   "source": [
    "### Floor speeches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c678f1f8-90f9-4595-8697-e67908203de5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "42\n"
     ]
    }
   ],
   "source": [
    "# Load the saved selection of floor-speech topic indices and print how many there are.\n",
    "# NOTE: allow_pickle=True lets np.load unpickle objects, which can run arbitrary code;\n",
    "# only load files from a trusted source.\n",
    "final_selected_speech_topic_inds = list(np.load('venue_diff_polsci/floor_speeches/post_tbip_output_labeling/final_selected_speech_topic_inds.npy',\n",
    "                                          allow_pickle=True))\n",
    "print(len(final_selected_speech_topic_inds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "478c943d-b3fa-447b-b790-7c280f6c8aa1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# floor speeches\n",
    "\n",
    "# Root directory of the floor-speech data; the paths below are built from it.\n",
    "project_dir = os.path.abspath('/workspace/pranav/tbip/data/floor_speeches_congs_115_116/') \n",
    "# NOTE(review): fit_dir is not referenced anywhere else in the visible part of this notebook.\n",
    "fit_dir = os.path.join(project_dir, \"mallet_fits_removed_procedural_speeches\")\n",
    "#source_dir = os.path.join(project_dir, \"data/synthetic\")\n",
    "\n",
    "# Load TBIP data.\n",
    "data_dir = os.path.join(project_dir, \"clean_removing_procedural\")\n",
    "(counts_speeches, vocabulary_speeches, author_indices_speeches, \n",
    " author_map_speeches) = utils.load_text_data(data_dir)\n",
    "\n",
    "# Load TBIP parameters.\n",
    "# The first two values returned by load_tbip_parameters are discarded here.\n",
    "param_dir = os.path.join(project_dir, \"tbip-pytorch-fits-issue-specific-rem-procedural-speeches-k50-init-mallet/params/\")\n",
    "(_, _, objective_topic_loc_speeches, objective_topic_scale_speeches, \n",
    " ideological_topic_loc_speeches, ideological_topic_scale_speeches, ideal_point_loc_speeches, \n",
    " ideal_point_scale_speeches) = utils.load_tbip_parameters(param_dir)\n",
    "\n",
    "# Compute means from variational parameters\n",
    "#document_mean_speeches = np.exp(document_loc + document_scale ** 2 / 2)\n",
    "objective_topic_mean_speeches = np.exp(objective_topic_loc_speeches + \n",
    "                              objective_topic_scale_speeches ** 2 / 2)\n",
    "# NOTE: the two names below alias the *_loc arrays as loaded. A later cell rebinds\n",
    "# ideological_topic_loc_speeches to a sign-flipped product, which does not update\n",
    "# ideological_topic_mean_speeches.\n",
    "ideological_topic_mean_speeches = ideological_topic_loc_speeches\n",
    "ideal_point_mean_speeches = ideal_point_loc_speeches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "a6b165c8-4fb5-4d25-95bb-6f6f346299f6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(50, 11433)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check of the loaded array's shape; axis 0 is indexed by topic in the cells below\n",
    "# (axis 1 is presumably the vocabulary - confirm).\n",
    "ideological_topic_loc_speeches.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "886f9ae2-2a14-4c58-8e22-85da11a0ead3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "48\n"
     ]
    }
   ],
   "source": [
    "# Load the stored per-topic sign multipliers for floor speeches.\n",
    "# The context manager closes the file handle once the pickle has been read.\n",
    "# NOTE: pickle can execute arbitrary code on load; only open trusted files.\n",
    "with open('issue_specific_tbip/floor_speeches/topic_ind_to_ideological_multiplier.pkl', 'rb') as multiplier_file:\n",
    "    topic_ind_to_ideological_multiplier = pickle.load(multiplier_file)\n",
    "print(len(topic_ind_to_ideological_multiplier))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "45088a8c-d41d-413d-b402-082e6694fd00",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, -1, -1, -1, -1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, 1, -1]\n"
     ]
    }
   ],
   "source": [
    "# Start from +1 (no change) for every topic row, then overwrite the entries\n",
    "# that have a stored multiplier.\n",
    "multiplier = [1] * ideological_topic_loc_speeches.shape[0]\n",
    "for ind in topic_ind_to_ideological_multiplier:\n",
    "    multiplier[ind] = topic_ind_to_ideological_multiplier[ind]\n",
    "print(multiplier)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "e444e5b2-38bd-4264-8e2d-6c41e1b27b67",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "50"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: the broadcast in the next cell needs one multiplier per row of\n",
    "# ideological_topic_loc_speeches.\n",
    "len(multiplier)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "ec867d24-8664-4068-9515-4e005d8d1131",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scale every row (topic) by its multiplier; the new axis broadcasts it across the columns.\n",
    "# NOTE: this rebinds the same name, so running the cell a second time applies the\n",
    "# multipliers again (for +/-1 multipliers that undoes the flip). Re-run the\n",
    "# parameter-loading cell first if this cell has to be repeated.\n",
    "ideological_topic_loc_speeches = ideological_topic_loc_speeches * np.array(multiplier)[:, np.newaxis]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "f2453da1-904e-4601-9e30-f45c410c6498",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(50, 11433)\n",
      "(50, 11433)\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the floor-speech topic-word intensities at ideal points -1 and +1.\n",
    "ideological_topic_speeches_minus1 = get_ideological_topics(\n",
    "    objective_topic_loc=objective_topic_loc_speeches,\n",
    "    objective_topic_scale=objective_topic_scale_speeches,\n",
    "    ideological_topic_loc=ideological_topic_loc_speeches,\n",
    "    ideological_topic_scale=ideological_topic_scale_speeches,\n",
    "    ideal_point=-1.0,\n",
    ")\n",
    "print(ideological_topic_speeches_minus1.shape)\n",
    "ideological_topic_speeches_plus1 = get_ideological_topics(\n",
    "    objective_topic_loc=objective_topic_loc_speeches,\n",
    "    objective_topic_scale=objective_topic_scale_speeches,\n",
    "    ideological_topic_loc=ideological_topic_loc_speeches,\n",
    "    ideological_topic_scale=ideological_topic_scale_speeches,\n",
    "    ideal_point=1.0,\n",
    ")\n",
    "print(ideological_topic_speeches_plus1.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "4f82128a-116b-4ae4-8464-b4b5888766f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Materialise the vocabulary as a plain Python list (re-running this cell is harmless).\n",
    "vocabulary_speeches = list(vocabulary_speeches)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "7fb1eab5-9c55-4dae-a410-8c8a0b369426",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(50, 11433)\n",
      "(50, 11433)\n"
     ]
    }
   ],
   "source": [
    "# Rescale each row to sum to 1 so the weights can be read as probabilities.\n",
    "# Both names are rebound to the normalised arrays; the shapes are printed as a check.\n",
    "ideological_topic_speeches_minus1 = rescale_to_probs_renorm(ideological_topic_speeches_minus1)\n",
    "print(ideological_topic_speeches_minus1.shape)\n",
    "\n",
    "ideological_topic_speeches_plus1 = rescale_to_probs_renorm(ideological_topic_speeches_plus1)\n",
    "print(ideological_topic_speeches_plus1.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "78b47fe7-2acb-4e29-a41b-944dd1601e1a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 49 entries, 0 to 48\n",
      "Data columns (total 10 columns):\n",
      " #   Column                Non-Null Count  Dtype  \n",
      "---  ------                --------------  -----  \n",
      " 0   Topic                 49 non-null     object \n",
      " 1   Name 1                49 non-null     object \n",
      " 2   Description 1         48 non-null     object \n",
      " 3   Notes 1               9 non-null      object \n",
      " 4   Name 2                49 non-null     object \n",
      " 5   Description 2         42 non-null     object \n",
      " 6   Notes 2               16 non-null     object \n",
      " 7   Unnamed: 7            0 non-null      float64\n",
      " 8   Consensus Topic Name  49 non-null     object \n",
      " 9   Notes/Comments        14 non-null     object \n",
      "dtypes: float64(1), object(9)\n",
      "memory usage: 4.0+ KB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# Read the consensus topic labels for floor speeches from 'Sheet1' of the workbook.\n",
    "consensus_labels_speech = pd.read_excel(\n",
    "    'venue_diff_polsci/consensus_topic_labeling_files/results/speeches_consensus_labeling.xlsx',\n",
    "    sheet_name=None,\n",
    "    engine='openpyxl',\n",
    ")['Sheet1']\n",
    "# DataFrame.info() prints its summary itself and returns None, so it is called\n",
    "# directly; wrapping it in print() appends a stray 'None' line to the output.\n",
    "consensus_labels_speech.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "8a3e1131-915b-4c9e-a89f-15b60cf8d7af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build {topic index: consensus label}. The key is the second whitespace-separated\n",
    "# token of each 'Topic' cell parsed as an int, minus one (presumably turning a\n",
    "# 1-based topic number into a 0-based index - confirm against the spreadsheet).\n",
    "speech_c_ts = list(consensus_labels_speech['Topic'])\n",
    "speech_c_labels = list(consensus_labels_speech['Consensus Topic Name'])\n",
    "speech_topics_to_labels = {\n",
    "    int(topic_cell.split()[1]) - 1: consensus_label\n",
    "    for topic_cell, consensus_label in zip(speech_c_ts, speech_c_labels)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "8aeba9c5-b4ad-47f6-9813-f4b23ea473d7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "42\n"
     ]
    }
   ],
   "source": [
    "# Look up the consensus label of every selected topic, keeping the selection order.\n",
    "# A selected index that is missing from speech_topics_to_labels raises KeyError here.\n",
    "final_selected_speech_topic_labels = [speech_topics_to_labels[t] for t in final_selected_speech_topic_inds]\n",
    "print(len(final_selected_speech_topic_labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "70187549-4d8e-4786-8847-52bf3e49cc00",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the floor-speech annotation workbook and keep the returned a)/b) assignment.\n",
    "topic_ind_to_a_b_info_speech = create_topics_file_for_human_labeling_and_rating(\n",
    "    ideological_topic_speeches_minus1,\n",
    "    ideological_topic_speeches_plus1,\n",
    "    vocabulary_speeches,\n",
    "    '/workspace/pranav/tbip/analysis/human_annotation_files/issue_specific_tbip/floor_speeches/post_tbip_annotation/',\n",
    "    final_selected_speech_topic_inds,\n",
    "    final_selected_speech_topic_labels,\n",
    ")\n",
    "# Persist the assignment; the context manager guarantees the file is flushed and closed.\n",
    "with open('issue_specific_tbip/floor_speeches/topic_ind_to_a_b_info_post_tbip.pkl', 'wb') as a_b_info_file:\n",
    "    pickle.dump(topic_ind_to_a_b_info_speech, a_b_info_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5510a631-53f4-42c0-902a-2b6818b87def",
   "metadata": {},
   "source": [
    "### Twitter tweets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "8ac7b374-2bb0-4456-a74b-35c5ed59c254",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "40\n"
     ]
    }
   ],
   "source": [
    "# Load the saved selection of tweet topic indices and print how many there are.\n",
    "# NOTE: allow_pickle=True lets np.load unpickle objects, which can run arbitrary code;\n",
    "# only load files from a trusted source.\n",
    "final_selected_tweet_topic_inds = list(np.load('venue_diff_polsci/twitter/post_tbip_output_labeling/final_selected_tweet_topic_inds.npy',\n",
    "                                          allow_pickle=True))\n",
    "print(len(final_selected_tweet_topic_inds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "151a7096-a567-4e5a-8bfb-d9267b62b153",
   "metadata": {},
   "outputs": [],
   "source": [
    "# tweets\n",
    "\n",
    "# Root directory of the tweet data; the paths below are built from it.\n",
    "# NOTE: project_dir, fit_dir, data_dir and param_dir are reused names and now point at the tweet data.\n",
    "project_dir = os.path.abspath('/workspace/pranav/tbip/data/tweets_cong_115_116/') \n",
    "# NOTE(review): fit_dir is not referenced anywhere else in the visible part of this notebook.\n",
    "fit_dir = os.path.join(project_dir, \"mallet_results/tbip_expanded_preprocessing_k50\")\n",
    "#source_dir = os.path.join(project_dir, \"data/synthetic\")\n",
    "\n",
    "# Load TBIP data.\n",
    "data_dir = os.path.join(project_dir, \"clean2\")\n",
    "(counts_tweets, vocabulary_tweets, author_indices_tweets, \n",
    " author_map_tweets) = utils.load_text_data(data_dir)\n",
    "\n",
    "# Load TBIP parameters.\n",
    "# The first two values returned by load_tbip_parameters are discarded here.\n",
    "param_dir = os.path.join(project_dir, \"tbip-issue-specific-k50-expanded-vocab-with-mallet-scaled-topic/params/\")\n",
    "(_, _, objective_topic_loc_tweets, objective_topic_scale_tweets, \n",
    " ideological_topic_loc_tweets, ideological_topic_scale_tweets, ideal_point_loc_tweets, \n",
    " ideal_point_scale_tweets) = utils.load_tbip_parameters(param_dir)\n",
    "\n",
    "# Compute means from variational parameters\n",
    "#document_mean_speeches = np.exp(document_loc + document_scale ** 2 / 2)\n",
    "objective_topic_mean_tweets = np.exp(objective_topic_loc_tweets + \n",
    "                              objective_topic_scale_tweets ** 2 / 2)\n",
    "# NOTE: the two names below alias the *_loc arrays as loaded. A later cell rebinds\n",
    "# ideological_topic_loc_tweets to a sign-flipped product, which does not update\n",
    "# ideological_topic_mean_tweets.\n",
    "ideological_topic_mean_tweets = ideological_topic_loc_tweets\n",
    "ideal_point_mean_tweets = ideal_point_loc_tweets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "b2d58bbf-f76f-418a-bf8c-56338def5e7e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "48\n"
     ]
    }
   ],
   "source": [
    "# Load the stored per-topic sign multipliers for tweets. This rebinds\n",
    "# topic_ind_to_ideological_multiplier, which previously held the floor-speech values.\n",
    "# The context manager closes the file handle once the pickle has been read.\n",
    "# NOTE: pickle can execute arbitrary code on load; only open trusted files.\n",
    "with open('issue_specific_tbip/twitter/topic_ind_to_ideological_multiplier.pkl', 'rb') as multiplier_file:\n",
    "    topic_ind_to_ideological_multiplier = pickle.load(multiplier_file)\n",
    "print(len(topic_ind_to_ideological_multiplier))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "a3d9bd13-88d3-4c0c-9817-93a570bacfa6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, -1, -1]\n"
     ]
    }
   ],
   "source": [
    "# Start from +1 (no change) for every tweet topic row, then overwrite the entries\n",
    "# that have a stored multiplier. The list is sized from the tweet parameters (not the\n",
    "# floor-speech ones) so it stays correct if the two models have different topic counts.\n",
    "multiplier = [1 for _ in range(ideological_topic_loc_tweets.shape[0])]\n",
    "for ind in topic_ind_to_ideological_multiplier:\n",
    "    multiplier[ind] = topic_ind_to_ideological_multiplier[ind]\n",
    "print(multiplier)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "42d782df-679f-416a-ad4c-866c4fa82d31",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "50"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: the broadcast in the next cell needs one multiplier per row of\n",
    "# ideological_topic_loc_tweets.\n",
    "len(multiplier)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "c1dea9b3-ebfa-45f9-9d54-7ca4709f8c26",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scale every row (topic) by its multiplier; the new axis broadcasts it across the columns.\n",
    "# NOTE: this rebinds the same name, so running the cell a second time applies the\n",
    "# multipliers again (for +/-1 multipliers that undoes the flip). Re-run the\n",
    "# parameter-loading cell first if this cell has to be repeated.\n",
    "ideological_topic_loc_tweets = ideological_topic_loc_tweets * np.array(multiplier)[:, np.newaxis]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "4cc097fe-2341-4229-b4b7-8d2ae38dfb8c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(50, 9343)\n",
      "(50, 9343)\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the tweet topic-word intensities at ideal points -1 and +1.\n",
    "ideological_topic_tweets_minus1 = get_ideological_topics(\n",
    "    objective_topic_loc=objective_topic_loc_tweets,\n",
    "    objective_topic_scale=objective_topic_scale_tweets,\n",
    "    ideological_topic_loc=ideological_topic_loc_tweets,\n",
    "    ideological_topic_scale=ideological_topic_scale_tweets,\n",
    "    ideal_point=-1.0,\n",
    ")\n",
    "print(ideological_topic_tweets_minus1.shape)\n",
    "ideological_topic_tweets_plus1 = get_ideological_topics(\n",
    "    objective_topic_loc=objective_topic_loc_tweets,\n",
    "    objective_topic_scale=objective_topic_scale_tweets,\n",
    "    ideological_topic_loc=ideological_topic_loc_tweets,\n",
    "    ideological_topic_scale=ideological_topic_scale_tweets,\n",
    "    ideal_point=1.0,\n",
    ")\n",
    "print(ideological_topic_tweets_plus1.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "a92fb254-6977-4f7b-b129-dd0e030e72d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Materialise the vocabulary as a plain Python list (re-running this cell is harmless).\n",
    "vocabulary_tweets = list(vocabulary_tweets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "f35bdd58-9fb6-4dfb-80b7-638096c0fe4f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(50, 9343)\n",
      "(50, 9343)\n"
     ]
    }
   ],
   "source": [
    "# Rescale each row to sum to 1 so the weights can be read as probabilities.\n",
    "# Both names are rebound to the normalised arrays; the shapes are printed as a check.\n",
    "ideological_topic_tweets_minus1 = rescale_to_probs_renorm(ideological_topic_tweets_minus1)\n",
    "print(ideological_topic_tweets_minus1.shape)\n",
    "\n",
    "ideological_topic_tweets_plus1 = rescale_to_probs_renorm(ideological_topic_tweets_plus1)\n",
    "print(ideological_topic_tweets_plus1.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "010b0071-43df-4f7e-adb0-3871b7c3c7af",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 50 entries, 0 to 49\n",
      "Data columns (total 10 columns):\n",
      " #   Column                Non-Null Count  Dtype  \n",
      "---  ------                --------------  -----  \n",
      " 0   Topic                 50 non-null     object \n",
      " 1   Name 1                50 non-null     object \n",
      " 2   Description 1         50 non-null     object \n",
      " 3   Notes 1               50 non-null     object \n",
      " 4   Name 2                50 non-null     object \n",
      " 5   Description 2         38 non-null     object \n",
      " 6   Notes 2               22 non-null     object \n",
      " 7   Unnamed: 7            0 non-null      float64\n",
      " 8   Consensus Topic Name  50 non-null     object \n",
      " 9   Notes/Comments        16 non-null     object \n",
      "dtypes: float64(1), object(9)\n",
      "memory usage: 4.0+ KB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "consensus_labels_tweet = pd.read_excel('venue_diff_polsci/consensus_topic_labeling_files/results/tweets_consensus_labeling.xlsx',\n",
    "                                       sheet_name=None,\n",
    "                                        engine='openpyxl')['Sheet1']\n",
    "print(consensus_labels_tweet.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "76eb9a9c-ff5a-4033-8780-7c20a700bec9",
   "metadata": {},
   "outputs": [],
   "source": [
    "tweet_topics_to_labels = {}\n",
    "tweet_c_ts = list(consensus_labels_tweet['Topic'])\n",
    "tweet_c_labels = list(consensus_labels_tweet['Consensus Topic Name'])\n",
    "for t, l in zip(tweet_c_ts, tweet_c_labels):\n",
    "    tweet_topics_to_labels[int(t.split()[1]) - 1] = l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "cd93de85-d9ce-4513-a74a-9f43cd46ce5f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "40\n"
     ]
    }
   ],
   "source": [
    "final_selected_tweet_topic_labels = [tweet_topics_to_labels[t] for t in final_selected_tweet_topic_inds]\n",
    "print(len(final_selected_tweet_topic_labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "25743aa4-b562-4f4b-97a4-42d742e86b01",
   "metadata": {},
   "outputs": [],
   "source": [
    "topic_ind_to_a_b_info_tweet = create_topics_file_for_human_labeling_and_rating(ideological_topic_tweets_minus1,\n",
    "                                                 ideological_topic_tweets_plus1,\n",
    "                                                 vocabulary_tweets,\n",
    "                                                 '/workspace/pranav/tbip/analysis/human_annotation_files/issue_specific_tbip/twitter/post_tbip_annotation/',\n",
    "                                                 final_selected_tweet_topic_inds,\n",
    "                                                 final_selected_tweet_topic_labels\n",
    "                                                )\n",
    "pickle.dump(topic_ind_to_a_b_info_tweet,\n",
    "    open('issue_specific_tbip/twitter/topic_ind_to_a_b_info_post_tbip.pkl', \n",
    "                 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c7b42948-4123-40f1-b2be-17364bdbdceb",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2f4c418-723f-4c98-86d8-ac6677b6aad8",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2e4f7a6-ab77-49a0-9aaf-de4feda1a73c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "id": "54ae8c1a-1fc8-4589-beaf-25e61d100688",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "42\n"
     ]
    }
   ],
   "source": [
    "selected_speech_topics_after_discarding_based_on_labels = [t for t in speech_topics_to_labels if speech_topics_to_labels[t] not in labels_to_discard]\n",
    "print(len(selected_speech_topics_after_discarding_based_on_labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "id": "5c6c41f9-8a8a-4254-82c0-f94dbfb21842",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "40\n"
     ]
    }
   ],
   "source": [
    "selected_tweet_topics_after_discarding_based_on_labels = [t for t in tweet_topics_to_labels if tweet_topics_to_labels[t] not in labels_to_discard]\n",
    "print(len(selected_tweet_topics_after_discarding_based_on_labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "id": "bbcc5fae-5c09-47a8-bef9-17271640aa4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_dataframe_from_annotated_xlsx_file_path(path):\n",
    "    df = pd.read_excel(path,\n",
    "                       sheet_name=None,\n",
    "                       engine='openpyxl')\n",
    "    df = df['Sheet1']\n",
    "    df = df[~pd.isnull(df['Topic Name'])]\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "id": "1953084d-3183-482d-b911-9f1e1b26d696",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/workspace/.conda/envs/tbip/lib/python3.6/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Unknown extension is not supported and will be removed\n",
      "  warn(msg)\n",
      "/workspace/.conda/envs/tbip/lib/python3.6/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Conditional Formatting extension is not supported and will be removed\n",
      "  warn(msg)\n"
     ]
    }
   ],
   "source": [
    "annotator1_speeches = get_dataframe_from_annotated_xlsx_file_path('venue_diff_polsci/pre_tbip_annotation_results/annotator_1/topics_for_annotation.xlsx_frazier_speech.xlsx')\n",
    "annotator2_speeches = get_dataframe_from_annotated_xlsx_file_path('venue_diff_polsci/pre_tbip_annotation_results/annotator_2/topics_for_annotation_Hightower_speech.xlsx')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "id": "2250e007-0016-4cad-a6e2-1cf7c3d659d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "speeches_topic_to_coherence1 = {}\n",
    "speech_ts1 = list(annotator1_speeches['Topic'])\n",
    "speech_coherence1 = list(annotator1_speeches['Coherence'])\n",
    "for t, l in zip(speech_ts1, speech_coherence1):\n",
    "    speeches_topic_to_coherence1[int(t.split()[1]) - 1] = l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "id": "10d63e60-5244-4a86-b005-86307bee6b78",
   "metadata": {},
   "outputs": [],
   "source": [
    "discard_based_on_coherence_speeches1 = [t for t in speeches_topic_to_coherence1 if speeches_topic_to_coherence1[t] == 1.0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "id": "3378123e-3bf9-41c9-97fb-1113934f798b",
   "metadata": {},
   "outputs": [],
   "source": [
    "speeches_topic_to_coherence2 = {}\n",
    "speech_ts2 = list(annotator2_speeches['Topic'])\n",
    "speech_coherence2 = list(annotator2_speeches['Coherence'])\n",
    "for t, l in zip(speech_ts2, speech_coherence2):\n",
    "    speeches_topic_to_coherence2[int(t.split()[1]) - 1] = l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "id": "16ebe496-bdf4-4f0f-bcc8-3de4aa6a1a1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "discard_based_on_coherence_speeches2 = [t for t in speeches_topic_to_coherence2 if speeches_topic_to_coherence2[t] == 1.0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "id": "11331211-16b0-4c33-aa95-20b3b15ce958",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(15, 'Annotator 2 rated coherence as 1.0')\n",
      "International Human Rights\n",
      "(31, 'Annotator 2 rated coherence as 1.0')\n",
      "Support for Legislation\n",
      "(35, 'Annotator 2 rated coherence as 1.0')\n",
      "Elections and Appointments\n",
      "(46, 'Annotator 2 rated coherence as 1.0')\n",
      "Funding\n"
     ]
    }
   ],
   "source": [
    "topics_to_consider_removing_speeches1 = []\n",
    "for t in selected_speech_topics_after_discarding_based_on_labels:\n",
    "    if t in discard_based_on_coherence_speeches1:\n",
    "        print((t, 'Annotator 1 rated coherence as 1.0'))\n",
    "        print(speech_topics_to_labels[t])\n",
    "    if t in discard_based_on_coherence_speeches2:\n",
    "        print((t, 'Annotator 2 rated coherence as 1.0'))\n",
    "        print(speech_topics_to_labels[t])\n",
    "    #print('---')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "id": "0062ef0d-6d1c-4fe8-b3b0-ff3abc00bd5a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[16]\n"
     ]
    }
   ],
   "source": [
    "topics_to_dicard_based_on_coherence_score_speeches = list(set(discard_based_on_coherence_speeches1).intersection(set(discard_based_on_coherence_speeches2)))\n",
    "print(topics_to_dicard_based_on_coherence_score_speeches)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f5151617-f1b6-4f75-9538-4ee034424fc4",
   "metadata": {},
   "source": [
    "### Current strategy to form finalized list of topics included for post-tbip annotation\n",
    "1. Discarding some on consensul labels - DISCARD, and some non-substantive-issue ones like procedural, congratulatory, etc.\n",
    "2. Discarding those that were rated 1 for coherence by BOTH annotators (should we discard those rated as 1 for coherence by 1/2 annotators?)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "id": "79974cac-6957-4968-a697-3a39ce729b00",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "42\n"
     ]
    }
   ],
   "source": [
    "final_selected_speech_topic_inds = [t for t in selected_speech_topics_after_discarding_based_on_labels if t not in topics_to_dicard_based_on_coherence_score_speeches]\n",
    "print(len(final_selected_speech_topic_inds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "id": "3b9d413f-2ad0-4ddc-bc45-cf437863b51f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "42\n"
     ]
    }
   ],
   "source": [
    "final_selected_speech_topic_labels = [speech_topics_to_labels[t] for t in final_selected_speech_topic_inds]\n",
    "print(len(final_selected_speech_topic_labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97b43849-3470-43bb-8946-bbd60f616d37",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "id": "0e33bbca-51f8-4487-a370-1bc95944328e",
   "metadata": {},
   "outputs": [],
   "source": [
    "annotator1_tweets = get_dataframe_from_annotated_xlsx_file_path('venue_diff_polsci/pre_tbip_annotation_results/annotator_1/topics_for_annotation.xlsx_Frazier_tweet.xlsx')\n",
    "annotator2_tweets = get_dataframe_from_annotated_xlsx_file_path('venue_diff_polsci/pre_tbip_annotation_results/annotator_2/topics_for_annotation_Hightower_tweets.xlsx')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "id": "e0550829-b13d-4088-97d6-aac5e13623b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "tweet_topic_to_coherence1 = {}\n",
    "tweet_ts1 = list(annotator1_tweets['Topic'])\n",
    "tweet_coherence1 = list(annotator1_tweets['Coherence'])\n",
    "for t, l in zip(tweet_ts1, tweet_coherence1):\n",
    "    tweet_topic_to_coherence1[int(t.split()[1])] = l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "id": "105c024f-19ed-4ab1-980a-9c8f0841aad1",
   "metadata": {},
   "outputs": [],
   "source": [
    "discard_based_on_coherence_tweets1 = [t for t in tweet_topic_to_coherence1 if tweet_topic_to_coherence1[t] == 1.0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "id": "b831ea53-34a0-4894-97c1-c3fc7118c69b",
   "metadata": {},
   "outputs": [],
   "source": [
    "tweet_topic_to_coherence2 = {}\n",
    "tweet_ts2 = list(annotator2_tweets['Topic'])\n",
    "tweet_coherence2 = list(annotator2_tweets['Coherence'])\n",
    "for t, l in zip(tweet_ts2, tweet_coherence2):\n",
    "    tweet_topic_to_coherence2[int(t.split()[1])] = l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "id": "b17f45d3-6ec7-4a11-a952-f5896faa289b",
   "metadata": {},
   "outputs": [],
   "source": [
    "discard_based_on_coherence_tweets2 = [t for t in tweet_topic_to_coherence2 if tweet_topic_to_coherence2[t] == 1.0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "id": "fddeee08-eb28-4f7d-86d9-3b13f6b52b39",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(12, 'Annotator 2 rated coherence as 1.0')\n",
      "Promoting Unity\n",
      "(19, 'Annotator 2 rated coherence as 1.0')\n",
      "Legislation\n",
      "(21, 'Annotator 2 rated coherence as 1.0')\n",
      "Public Health\n",
      "(24, 'Annotator 2 rated coherence as 1.0')\n",
      "Job Market\n",
      "(25, 'Annotator 2 rated coherence as 1.0')\n",
      "Registration\n",
      "(26, 'Annotator 2 rated coherence as 1.0')\n",
      "Civil Rights of Gender and Sexuality\n",
      "(37, 'Annotator 2 rated coherence as 1.0')\n",
      "Job Creation\n",
      "(40, 'Annotator 2 rated coherence as 1.0')\n",
      "Border Wall\n"
     ]
    }
   ],
   "source": [
    "#topics_to_consider_removing_speeches1 = []\n",
    "for t in selected_tweet_topics_after_discarding_based_on_labels:\n",
    "    if t in discard_based_on_coherence_tweets1:\n",
    "        print((t, 'Annotator 1 rated coherence as 1.0'))\n",
    "        print(tweet_topics_to_labels[t])\n",
    "    if t in discard_based_on_coherence_tweets2:\n",
    "        print((t, 'Annotator 2 rated coherence as 1.0'))\n",
    "        print(tweet_topics_to_labels[t])\n",
    "    #print('---')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "id": "4fa9d2d2-bc53-42c0-a0a2-bfd953c94f80",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[]\n"
     ]
    }
   ],
   "source": [
    "topics_to_dicard_based_on_coherence_score_tweets = list(set(discard_based_on_coherence_tweets1).intersection(set(discard_based_on_coherence_tweets2)))\n",
    "print(topics_to_dicard_based_on_coherence_score_tweets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "id": "800508f3-3610-4636-a5d9-6c909abdb470",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "40\n"
     ]
    }
   ],
   "source": [
    "final_selected_tweet_topic_inds = [t for t in selected_tweet_topics_after_discarding_based_on_labels if t not in topics_to_dicard_based_on_coherence_score_tweets]\n",
    "print(len(final_selected_tweet_topic_inds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "id": "6408a991-9ab3-4a0c-b2d5-bc306abd6c62",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "40\n"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86872ab5-f8a4-4e04-b5d2-0827adea6fc6",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "id": "8a963c3c-f72b-4627-bd67-b1f3d2b88d04",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "id": "f44c0828-28ee-4427-984f-e6105bf0de23",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['a']"
      ]
     },
     "execution_count": 157,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "random.Random(42).sample(['a', 'b'], 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "id": "a34d58ab-149c-4b96-b050-6568a8eccb28",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "id": "f10d494b-6023-42e1-af39-8356fa18d6e6",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "id": "c98adf8c-8d64-4609-b5f0-a8d0f8ed16ab",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "id": "b383d6f6-3135-40b9-9af3-c19c7ba4c8b1",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "id": "4ac7abae-50eb-4335-ba69-ad30b9e866e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "789c457c-84f7-48ea-93c7-6258c7f56f40",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:tbip] *",
   "language": "python",
   "name": "conda-env-tbip-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
